1   package org.apache.lucene.search.vectorhighlight;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import org.apache.lucene.analysis.MockAnalyzer;
21  import org.apache.lucene.document.Document;
22  import org.apache.lucene.document.Field;
23  import org.apache.lucene.document.FieldType;
24  import org.apache.lucene.document.TextField;
25  import org.apache.lucene.index.DirectoryReader;
26  import org.apache.lucene.index.IndexReader;
27  import org.apache.lucene.index.IndexWriter;
28  import org.apache.lucene.index.IndexWriterConfig;
29  import org.apache.lucene.index.IndexWriterConfig.OpenMode;
30  import org.apache.lucene.index.RandomIndexWriter;
31  import org.apache.lucene.index.Term;
32  import org.apache.lucene.search.BooleanClause;
33  import org.apache.lucene.search.BooleanQuery;
34  import org.apache.lucene.search.Query;
35  import org.apache.lucene.search.TermQuery;
36  import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
37  import org.apache.lucene.store.Directory;
38  import org.apache.lucene.util.TestUtil;
39  
40  import java.util.ArrayList;
41  import java.util.HashMap;
42  import java.util.HashSet;
43  import java.util.List;
44  import java.util.Map;
45  import java.util.Set;
46  
47  public class SimpleFragmentsBuilderTest extends AbstractTestCase {
48    
49    public void test1TermIndex() throws Exception {
50      FieldFragList ffl = ffl(new TermQuery(new Term(F, "a")), "a" );
51      SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
52      assertEquals( "<b>a</b>", sfb.createFragment( reader, 0, F, ffl ) );
53  
54      // change tags
55      sfb = new SimpleFragmentsBuilder( new String[]{ "[" }, new String[]{ "]" } );
56      assertEquals( "[a]", sfb.createFragment( reader, 0, F, ffl ) );
57    }
58    
59    public void test2Frags() throws Exception {
60      FieldFragList ffl = ffl(new TermQuery(new Term(F, "a")), "a b b b b b b b b b b b a b a b" );
61      SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
62      String[] f = sfb.createFragments( reader, 0, F, ffl, 3 );
63      // 3 snippets requested, but should be 2
64      assertEquals( 2, f.length );
65      assertEquals( "<b>a</b> b b b b b b b b b b", f[0] );
66      assertEquals( "b b <b>a</b> b <b>a</b> b", f[1] );
67    }
68    
69    public void test3Frags() throws Exception {
70      BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
71      booleanQuery.add(new TermQuery(new Term(F, "a")), BooleanClause.Occur.SHOULD);
72      booleanQuery.add(new TermQuery(new Term(F, "c")), BooleanClause.Occur.SHOULD);
73      
74      FieldFragList ffl = ffl(booleanQuery.build(), "a b b b b b b b b b b b a b a b b b b b c a a b b" );
75      SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
76      String[] f = sfb.createFragments( reader, 0, F, ffl, 3 );
77      assertEquals( 3, f.length );
78      assertEquals( "<b>a</b> b b b b b b b b b b", f[0] );
79      assertEquals( "b b <b>a</b> b <b>a</b> b b b b b c", f[1] );
80      assertEquals( "<b>c</b> <b>a</b> <b>a</b> b b", f[2] );
81    }
82    
83    public void testTagsAndEncoder() throws Exception {
84      FieldFragList ffl = ffl(new TermQuery(new Term(F, "a")), "<h1> a </h1>" );
85      SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
86      String[] preTags = { "[" };
87      String[] postTags = { "]" };
88      assertEquals( "&lt;h1&gt; [a] &lt;&#x2F;h1&gt;",
89          sfb.createFragment( reader, 0, F, ffl, preTags, postTags, new SimpleHTMLEncoder() ) );
90    }
91  
92    private FieldFragList ffl(Query query, String indexValue ) throws Exception {
93      make1d1fIndex( indexValue );
94      FieldQuery fq = new FieldQuery( query, true, true );
95      FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
96      FieldPhraseList fpl = new FieldPhraseList( stack, fq );
97      return new SimpleFragListBuilder().createFieldFragList( fpl, 20 );
98    }
99    
100   public void test1PhraseShortMV() throws Exception {
101     makeIndexShortMV();
102 
103     FieldQuery fq = new FieldQuery( tq( "d" ), true, true );
104     FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
105     FieldPhraseList fpl = new FieldPhraseList( stack, fq );
106     SimpleFragListBuilder sflb = new SimpleFragListBuilder();
107     FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
108     SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
109     // Should we probably be trimming?
110     assertEquals( "  a b c  <b>d</b> e", sfb.createFragment( reader, 0, F, ffl ) );
111   }
112   
113   public void test1PhraseLongMV() throws Exception {
114     makeIndexLongMV();
115 
116     FieldQuery fq = new FieldQuery( pqF( "search", "engines" ), true, true );
117     FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
118     FieldPhraseList fpl = new FieldPhraseList( stack, fq );
119     SimpleFragListBuilder sflb = new SimpleFragListBuilder();
120     FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
121     SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
122     assertEquals( "customization: The most <b>search engines</b> use only one of these methods. Even the <b>search engines</b> that says they can",
123         sfb.createFragment( reader, 0, F, ffl ) );
124   }
125 
126   public void test1PhraseLongMVB() throws Exception {
127     makeIndexLongMVB();
128 
129     FieldQuery fq = new FieldQuery( pqF( "sp", "pe", "ee", "ed" ), true, true ); // "speed" -(2gram)-> "sp","pe","ee","ed"
130     FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
131     FieldPhraseList fpl = new FieldPhraseList( stack, fq );
132     SimpleFragListBuilder sflb = new SimpleFragListBuilder();
133     FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
134     SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
135     assertEquals( "additional hardware. \nWhen you talk about processing <b>speed</b>, the", sfb.createFragment( reader, 0, F, ffl ) );
136   }
137   
138   public void testUnstoredField() throws Exception {
139     makeUnstoredIndex();
140 
141     FieldQuery fq = new FieldQuery( tq( "aaa" ), true, true );
142     FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
143     FieldPhraseList fpl = new FieldPhraseList( stack, fq );
144     SimpleFragListBuilder sflb = new SimpleFragListBuilder();
145     FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
146     SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
147     assertNull( sfb.createFragment( reader, 0, F, ffl ) );
148   }
149   
150   protected void makeUnstoredIndex() throws Exception {
151     IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzerW).setOpenMode(OpenMode.CREATE));
152     Document doc = new Document();
153     FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
154     customType.setStoreTermVectors(true);
155     customType.setStoreTermVectorOffsets(true);
156     customType.setStoreTermVectorPositions(true);
157     doc.add( new Field( F, "aaa", customType) );
158     //doc.add( new Field( F, "aaa", Store.NO, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
159     writer.addDocument( doc );
160     writer.close();
161     if (reader != null) reader.close();
162     reader = DirectoryReader.open(dir);
163   }
164   
165   public void test1StrMV() throws Exception {
166     makeIndexStrMV();
167 
168     FieldQuery fq = new FieldQuery( tq( "defg" ), true, true );
169     FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
170     FieldPhraseList fpl = new FieldPhraseList( stack, fq );
171     SimpleFragListBuilder sflb = new SimpleFragListBuilder();
172     FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
173     SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
174     sfb.setMultiValuedSeparator( '/' );
175     assertEquals( "abc/<b>defg</b>/hijkl", sfb.createFragment( reader, 0, F, ffl ) );
176   }
177   
178   public void testMVSeparator() throws Exception {
179     makeIndexShortMV();
180 
181     FieldQuery fq = new FieldQuery( tq( "d" ), true, true );
182     FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
183     FieldPhraseList fpl = new FieldPhraseList( stack, fq );
184     SimpleFragListBuilder sflb = new SimpleFragListBuilder();
185     FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
186     SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
187     sfb.setMultiValuedSeparator( '/' );
188     assertEquals( "//a b c//<b>d</b> e", sfb.createFragment( reader, 0, F, ffl ) );
189   }
190 
191   public void testDiscreteMultiValueHighlighting() throws Exception {
192     makeIndexShortMV();
193 
194     FieldQuery fq = new FieldQuery( tq( "d" ), true, true );
195     FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
196     FieldPhraseList fpl = new FieldPhraseList( stack, fq );
197     SimpleFragListBuilder sflb = new SimpleFragListBuilder();
198     FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
199     SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
200     sfb.setDiscreteMultiValueHighlighting(true);
201     assertEquals( "<b>d</b> e", sfb.createFragment( reader, 0, F, ffl ) );
202 
203     make1dmfIndex("some text to highlight", "highlight other text");
204     fq = new FieldQuery( tq( "text" ), true, true );
205     stack = new FieldTermStack( reader, 0, F, fq );
206     fpl = new FieldPhraseList( stack, fq );
207     sflb = new SimpleFragListBuilder();
208     ffl = sflb.createFieldFragList( fpl, 32 );
209     String[] result = sfb.createFragments(reader, 0, F, ffl, 3);
210     assertEquals(2, result.length);
211     assertEquals("some <b>text</b> to highlight", result[0]);
212     assertEquals("highlight other <b>text</b>", result[1]);
213 
214     fq = new FieldQuery( tq( "highlight" ), true, true );
215     stack = new FieldTermStack( reader, 0, F, fq );
216     fpl = new FieldPhraseList( stack, fq );
217     sflb = new SimpleFragListBuilder();
218     ffl = sflb.createFieldFragList( fpl, 32 );
219     result = sfb.createFragments(reader, 0, F, ffl, 3);
220     assertEquals(2, result.length);
221     assertEquals("text to <b>highlight</b>", result[0]);
222     assertEquals("<b>highlight</b> other text", result[1]);
223   }
224 
225   public void testRandomDiscreteMultiValueHighlighting() throws Exception {
226     String[] randomValues = new String[3 + random().nextInt(10 * RANDOM_MULTIPLIER)];
227     for (int i = 0; i < randomValues.length; i++) {
228       String randomValue;
229       do {
230         randomValue = TestUtil.randomSimpleString(random());
231       } while ("".equals(randomValue));
232       randomValues[i] = randomValue;
233     }
234 
235     Directory dir = newDirectory();
236     RandomIndexWriter writer = new RandomIndexWriter(
237         random(),
238         dir,
239         newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy()));
240 
241     FieldType customType = new FieldType(TextField.TYPE_STORED);
242     customType.setStoreTermVectors(true);
243     customType.setStoreTermVectorOffsets(true);
244     customType.setStoreTermVectorPositions(true);
245 
246     int numDocs = randomValues.length * 5;
247     int numFields = 2 + random().nextInt(5);
248     int numTerms = 2 + random().nextInt(3);
249     List<Doc> docs = new ArrayList<>(numDocs);
250     List<Document> documents = new ArrayList<>(numDocs);
251     Map<String, Set<Integer>> valueToDocId = new HashMap<>();
252     for (int i = 0; i < numDocs; i++) {
253       Document document = new Document();
254       String[][] fields = new String[numFields][numTerms];
255       for (int j = 0; j < numFields; j++) {
256         String[] fieldValues = new String[numTerms];
257         fieldValues[0] = getRandomValue(randomValues, valueToDocId, i);
258         StringBuilder builder = new StringBuilder(fieldValues[0]);
259         for (int k = 1; k < numTerms; k++) {
260           fieldValues[k] = getRandomValue(randomValues, valueToDocId, i);
261           builder.append(' ').append(fieldValues[k]);
262         }
263         document.add(new Field(F, builder.toString(), customType));
264         fields[j] = fieldValues;
265       }
266       docs.add(new Doc(fields));
267       documents.add(document);
268     }
269     writer.addDocuments(documents);
270     writer.close();
271     IndexReader reader = DirectoryReader.open(dir);
272 
273     try {
274       int highlightIters = 1 + random().nextInt(120 * RANDOM_MULTIPLIER);
275       for (int highlightIter = 0; highlightIter < highlightIters; highlightIter++) {
276         String queryTerm = randomValues[random().nextInt(randomValues.length)];
277         int randomHit = valueToDocId.get(queryTerm).iterator().next();
278         List<StringBuilder> builders = new ArrayList<>();
279         for (String[] fieldValues : docs.get(randomHit).fieldValues) {
280           StringBuilder builder = new StringBuilder();
281           boolean hit = false;
282           for (int i = 0; i < fieldValues.length; i++) {
283             if (queryTerm.equals(fieldValues[i])) {
284               builder.append("<b>").append(queryTerm).append("</b>");
285               hit = true;
286             } else {
287               builder.append(fieldValues[i]);
288             }
289             if (i != fieldValues.length - 1) {
290               builder.append(' ');
291             }
292           }
293           if (hit) {
294             builders.add(builder);
295           }
296         }
297 
298         FieldQuery fq = new FieldQuery(tq(queryTerm), true, true);
299         FieldTermStack stack = new FieldTermStack(reader, randomHit, F, fq);
300 
301         FieldPhraseList fpl = new FieldPhraseList(stack, fq);
302         SimpleFragListBuilder sflb = new SimpleFragListBuilder(100);
303         FieldFragList ffl = sflb.createFieldFragList(fpl, 300);
304 
305         SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
306         sfb.setDiscreteMultiValueHighlighting(true);
307         String[] actualFragments = sfb.createFragments(reader, randomHit, F, ffl, numFields);
308         assertEquals(builders.size(), actualFragments.length);
309         for (int i = 0; i < actualFragments.length; i++) {
310           assertEquals(builders.get(i).toString(), actualFragments[i]);
311         }
312       }
313     } finally {
314       reader.close();
315       dir.close();
316     }
317   }
318 
319   private String getRandomValue(String[] randomValues, Map<String, Set<Integer>> valueToDocId, int docId) {
320     String value = randomValues[random().nextInt(randomValues.length)];
321     if (!valueToDocId.containsKey(value)) {
322       valueToDocId.put(value, new HashSet<Integer>());
323     }
324     valueToDocId.get(value).add(docId);
325     return value;
326   }
327 
328   private static class Doc {
329 
330     final String[][] fieldValues;
331 
332     private Doc(String[][] fieldValues) {
333       this.fieldValues = fieldValues;
334     }
335   }
336 
337 }